library(tidyverse)
## ─ Attaching packages ───────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ─ Conflicts ─────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(dplyr)
library(stringr)
library(readr)
library(readxl)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(maps)
##
## Attaching package: 'maps'
## The following object is masked from 'package:purrr':
##
## map
library(maps)
Read original data
heart_disease_stratify = read_csv("data/Heart_Disease_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(state = location_abbr) %>%
rename(mortality_rate = data_value) %>%
mutate(state = state.name[match(state, state.abb)]) %>%
select(-data_source, -geographic_level, -class, -topic, -data_value_footnote, -data_value_footnote_symbol, -topic_id, -location_id )
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
heart_disease = heart_disease_stratify %>%
filter(stratification1 == "Overall", stratification2 == "Overall") %>%
select(-stratification1, -stratification2, -stratification_category1, -stratification_category2)
heart_disease$mortality_rate[is.na(heart_disease$mortality_rate)] = 0
heart_disease =
heart_disease %>%
group_by(state) %>%
summarise(mortality_rate = mean(mortality_rate))
Add air quality data
airquality_2015 = read_csv("data/airquality.csv") %>%
janitor::clean_names() %>%
select(state, pm2_5) %>%
group_by(state) %>%
summarize(pm2.5 = sum(pm2_5))
## Parsed with column specification:
## cols(
## State = col_character(),
## County = col_character(),
## Year = col_integer(),
## `Days with AQI` = col_integer(),
## `Good Days` = col_integer(),
## `Moderate Days` = col_integer(),
## `Unhealthy for Sensitive Groups Days` = col_integer(),
## `Unhealthy Days` = col_integer(),
## `Very Unhealthy Days` = col_integer(),
## `Hazardous Days` = col_integer(),
## `Max AQI` = col_integer(),
## `90th Percentile AQI` = col_integer(),
## `Median AQI` = col_integer(),
## `Days CO` = col_integer(),
## `Days NO2` = col_integer(),
## `Days Ozone` = col_integer(),
## `Days SO2` = col_integer(),
## PM2.5 = col_integer(),
## `Days PM10` = col_integer()
## )
Add obesity data
obesity_data = read_csv("data/National_Obesity_By_State.csv") %>%
janitor::clean_names() %>%
rename(state = name) %>%
rename(obesity_percentage = obesity) %>%
select(state, obesity_percentage)
## Parsed with column specification:
## cols(
## OBJECTID = col_integer(),
## NAME = col_character(),
## Obesity = col_double(),
## Shape__Area = col_double(),
## Shape__Length = col_double()
## )
data_with_obesity = left_join(heart_disease, obesity_data)
## Joining, by = "state"
Add stroke data
stroke_data = read_csv("data/Stroke_Mortality_Data_Among_US_Adults__35___by_State_Territory_and_County.csv") %>%
janitor::clean_names() %>%
rename(stroke_value=data_value)%>%
rename(state = location_abbr) %>%
mutate(state = state.name[match(state, state.abb)])%>%
select(state,stroke_value) %>%
group_by(state) %>%
filter(!is.na(stroke_value)) %>%
summarize(stroke_value = sum(stroke_value))
## Parsed with column specification:
## cols(
## Year = col_integer(),
## LocationAbbr = col_character(),
## LocationDesc = col_character(),
## GeographicLevel = col_character(),
## DataSource = col_character(),
## Class = col_character(),
## Topic = col_character(),
## Data_Value = col_double(),
## Data_Value_Unit = col_character(),
## Data_Value_Type = col_character(),
## Data_Value_Footnote_Symbol = col_character(),
## Data_Value_Footnote = col_character(),
## StratificationCategory1 = col_character(),
## Stratification1 = col_character(),
## StratificationCategory2 = col_character(),
## Stratification2 = col_character(),
## TopicID = col_character(),
## LocationID = col_character(),
## `Location 1` = col_character()
## )
Add income
income_data = read_excel("data/income_2015.xlsx", range = "A4:D55") %>%
janitor::clean_names() %>%
rename(state = united_states, median_income = x55117, income_standard_error = x253)
data_with_income = left_join(heart_disease,income_data, by = "state")
data_income_obesity = left_join(income_data,data_with_obesity, by = "state")
smoking_data = read_csv("data/smoking.csv") %>%
filter(YEAR == "2015-2016") %>%
mutate(year = 2015) %>%
rename(state = LocationDesc) %>%
select(-YEAR) %>%
select(year, state, Data_Value) %>%
select(year, state, Data_Value) %>%
rename(tobacco_comsumption = Data_Value) %>%
group_by(state) %>%
summarise(tobacco_consumption = sum(tobacco_comsumption))
## Parsed with column specification:
## cols(
## .default = col_character(),
## Data_Value = col_double(),
## Data_Value_Std_Err = col_double(),
## Low_Confidence_Limit = col_double(),
## High_Confidence_Limit = col_double(),
## Sample_Size = col_integer(),
## DisplayOrder = col_integer()
## )
## See spec(...) for full column specifications.
data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")
data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")
data_income_obesity_smoking = left_join(smoking_data, data_income_obesity, by = "state")
data_income_obesity_smoking_air = left_join(airquality_2015, data_income_obesity_smoking, by = "state")
final_data = left_join(stroke_data, data_income_obesity_smoking_air, by = "state")
Find the association between smoke and heart disease mortality
final_data %>%
mutate(state = forcats::fct_reorder(factor(state), tobacco_consumption)) %>%
ggplot(aes(x = mortality_rate, y = tobacco_consumption)) +
geom_point(aes(color = state), alpha = .5) +
labs(
title = "Tabacco Consumption Accross states"
) +
theme(text = element_text(size = 8), axis.text = element_text(angle = 60, hjust = 1), legend.position = "bottom")
## Warning: Removed 14 rows containing missing values (geom_point).
lm(mortality_rate~tobacco_consumption, data = final_data) %>%
summary()
##
## Call:
## lm(formula = mortality_rate ~ tobacco_consumption, data = final_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -124.235 -46.749 0.038 40.108 115.435
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 241.8406 45.2029 5.350 5.56e-06 ***
## tobacco_consumption 0.8384 0.3669 2.285 0.0285 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.37 on 35 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.1298, Adjusted R-squared: 0.105
## F-statistic: 5.222 on 1 and 35 DF, p-value: 0.02848
Find the association between income and heart disease mortality
final_data_income =
final_data %>%
mutate(state = forcats::fct_reorder(factor(state), median_income))
final_data_income %>%
ggplot(aes(x = mortality_rate, y = median_income, color = state)) +
geom_point() +
theme(text = element_text(size = 8), axis.text.x = element_text(angle = 60, hjust = 1), legend.position = "bottom") +
#Add the title and the name for x and y axis.
labs(
title = "Association between Income and Heart Disease Mortality Rate",
x = "Mortality Rate",
y = "median_income"
)
## Warning: Removed 1 rows containing missing values (geom_point).
lm(mortality_rate ~ median_income, data = final_data_income) %>%
summary()
##
## Call:
## lm(formula = mortality_rate ~ median_income, data = final_data_income)
##
## Residuals:
## Min 1Q Median 3Q Max
## -80.924 -29.753 -7.434 28.817 102.081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.102e+02 3.994e+01 15.279 < 2e-16 ***
## median_income -4.826e-03 7.058e-04 -6.838 1.3e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.39 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.4935, Adjusted R-squared: 0.4829
## F-statistic: 46.76 on 1 and 48 DF, p-value: 1.303e-08
From the lm result, we can observe that median_income is a very significant variable with a p value of 1.3e-08. This indicates there is a strong association between income and heart disease mortality rate
Find the association between airquality and heart disease mortality
## make scatterplot
final_data %>%
mutate(state = fct_reorder(state, mortality_rate)) %>%
ggplot(aes(x = mortality_rate, y = pm2.5, color = state)) +
geom_point() +
ggtitle("Airquality VS Mortality Rate ") +
theme(legend.position = "bottom",
legend.direction = "horizontal",
axis.text.x = element_text(angle = 90, size = 6),
legend.key.size = unit(0.05, "cm")) +
labs(x = "Mortality Rate",
y = "PM2.5")
## Warning: Removed 1 rows containing missing values (geom_point).
## fit simple linear regression model
air_regression<-lm(final_data$mortality_rate~final_data$pm2.5)
summary(air_regression)
##
## Call:
## lm(formula = final_data$mortality_rate ~ final_data$pm2.5)
##
## Residuals:
## Min 1Q Median 3Q Max
## -89.08 -38.02 -15.49 34.66 139.55
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.382e+02 1.427e+01 23.698 <2e-16 ***
## final_data$pm2.5 9.734e-04 4.673e-03 0.208 0.836
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.34 on 48 degrees of freedom
## (1 observation deleted due to missingness)
## Multiple R-squared: 0.0009031, Adjusted R-squared: -0.01991
## F-statistic: 0.04339 on 1 and 48 DF, p-value: 0.8359
From the scatterplot, we can see that the points are spread randomlly. However, the relationship between pm2.5 and mortality rate is unclear. For the states, with low pm2.5, some of them have low mortality rate and some of them have high mortality rate. After we fit the simple regression model, the p-value for pm2.5 is 0.836, so it is a non-significant variable.
library(plotly)
map_data = final_data %>%
mutate(state = tolower(state))
states <- map_data("state") %>%
rename(state = region)
a = left_join(states, map_data, by = "state") %>%
mutate(text_label = str_c("Region: ", state, 'Mortality rate: ', mortality_rate) )
a$text_label <- with(a, paste(state, '<br>', "Mortality_rate", mortality_rate))
# give state boundaries a white border
l <- list(color = toRGB("white"), width = 2)
# specify some map projection/options
g <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
p <- plot_geo(a, locationmode = 'USA-states') %>%
add_trace(
z = ~mortality_rate, text = ~text_label, locations = ~us,
color = ~mortality_rate, colors = 'Purples'
) %>%
colorbar(title = "Millions USD") %>%
layout(
title = '2011 US Agriculture Exports by State<br>(Hover for breakdown)',
geo = g
)
## Warning: Ignoring 10 observations
p